import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from scipy import stats
from kmodes.kmodes import KModes
student_no = 3914796
#read in the data and create a dataframe
file = '/Users/gretastojanovic/Desktop/DS/Python/RMIT_Python/Assignment_2/A2_HR_Employee_Data.csv'
raw = pd.read_csv(file)
df = pd.DataFrame(raw)
df.info() #get overview of the data
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1470 entries, 0 to 1469 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 EmployeeID 1470 non-null int64 1 Age 1470 non-null int64 2 Resigned 1470 non-null object 3 BusinessTravel 1470 non-null object 4 BusinessUnit 1470 non-null object 5 EducationLevel 1470 non-null int64 6 Gender 1470 non-null object 7 JobSatisfaction 1470 non-null int64 8 MaritalStatus 1470 non-null object 9 MonthlyIncome 1470 non-null int64 10 NumCompaniesWorked 1470 non-null int64 11 OverTime 1470 non-null object 12 PercentSalaryHike 1470 non-null int64 13 PerformanceRating 1470 non-null int64 14 AverageWeeklyHoursWorked 1470 non-null float64 15 TotalWorkingYears 1470 non-null int64 16 TrainingTimesLastYear 1470 non-null int64 17 WorkLifeBalance 1470 non-null int64 18 YearsAtCompany 1470 non-null int64 19 YearsInRole 1470 non-null int64 20 YearsSinceLastPromotion 1470 non-null int64 21 YearsWithCurrManager 1470 non-null int64 dtypes: float64(1), int64(15), object(6) memory usage: 252.8+ KB
Figure 1
#plot frequency of all variables
sns.set(style="dark")
fig,axs = plt.subplots(7,3, figsize = (15,12))
sns.histplot(data=df, x="EmployeeID", kde=True, color="skyblue", ax=axs[0, 0])
sns.histplot(data=df, x="Age", kde=True, color="skyblue", ax=axs[0, 0])
sns.histplot(data=df, x="EducationLevel", kde=True, color="olive", ax=axs[0, 1])
sns.histplot(data=df, x="JobSatisfaction", kde=True, color="gold", ax=axs[0, 2])
sns.histplot(data=df, x="MonthlyIncome", kde=True, color="teal", ax=axs[1, 0])
sns.histplot(data=df, x="NumCompaniesWorked", kde=True, color="teal", ax=axs[1, 1])
sns.histplot(data=df, x="PercentSalaryHike", kde=True, color="blue", ax=axs[1, 2])
sns.histplot(data=df, x="PerformanceRating", kde=True, color="purple", ax=axs[2,0])
sns.histplot(data=df, x="AverageWeeklyHoursWorked", kde=True, color="skyblue", ax=axs[2, 1])
sns.histplot(data=df, x="TotalWorkingYears", kde=True, color="olive", ax=axs[2, 2])
sns.histplot(data=df, x="TrainingTimesLastYear", kde=True, color="gold", ax=axs[3, 0])
sns.histplot(data=df, x="WorkLifeBalance", kde=True, color="teal", ax=axs[3, 1])
sns.histplot(data=df, x="YearsAtCompany", kde=True, color="teal", ax=axs[3,2])
sns.histplot(data=df, x="YearsInRole", kde=True, color="blue", ax=axs[4, 0])
sns.histplot(data=df, x="YearsSinceLastPromotion", kde=True, color="purple", ax=axs[4,1])
sns.histplot(data=df, x="YearsWithCurrManager", kde=True, color="skyblue", ax=axs[4, 2])
sns.histplot(data=df, x="Resigned", kde=True, color="skyblue", ax=axs[5, 0])
sns.histplot(data=df, x="BusinessTravel", kde=True, color="olive", ax=axs[5, 1])
sns.histplot(data=df, x="BusinessUnit", kde=True, color="gold", ax=axs[5, 2])
sns.histplot(data=df, x="Gender", kde=True, color="teal", ax=axs[6, 0])
sns.histplot(data=df, x="MaritalStatus", kde=True, color="blue", ax=axs[6, 1])
sns.histplot(data=df, x="OverTime", kde=True, color="purple", ax=axs[6,2])
fig.tight_layout()
df.describe()
| EmployeeID | Age | EducationLevel | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 |
| mean | 5491.933333 | 36.923810 | 2.912925 | 2.728571 | 6502.931293 | 2.693197 | 15.209524 | 3.153741 | 43.042177 | 11.279592 | 2.799320 | 2.761224 | 7.008163 | 4.229252 | 2.187755 | 4.123129 |
| std | 2573.640304 | 9.135373 | 1.024165 | 1.102846 | 4707.956783 | 2.498009 | 3.659938 | 0.360824 | 5.389864 | 7.780782 | 1.289271 | 0.706476 | 6.126525 | 3.623137 | 3.222430 | 3.568136 |
| min | 1006.000000 | 18.000000 | 1.000000 | 1.000000 | 1009.000000 | 0.000000 | 11.000000 | 3.000000 | 40.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 3286.250000 | 30.000000 | 2.000000 | 2.000000 | 2911.000000 | 1.000000 | 12.000000 | 3.000000 | 40.000000 | 6.000000 | 2.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 |
| 50% | 5484.500000 | 36.000000 | 3.000000 | 3.000000 | 4919.000000 | 2.000000 | 14.000000 | 3.000000 | 40.000000 | 10.000000 | 3.000000 | 3.000000 | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
| 75% | 7766.250000 | 43.000000 | 4.000000 | 4.000000 | 8379.000000 | 4.000000 | 18.000000 | 3.000000 | 47.000000 | 15.000000 | 3.000000 | 3.000000 | 9.000000 | 7.000000 | 3.000000 | 7.000000 |
| max | 9991.000000 | 60.000000 | 5.000000 | 4.000000 | 19999.000000 | 9.000000 | 25.000000 | 4.000000 | 71.000000 | 40.000000 | 6.000000 | 4.000000 | 40.000000 | 18.000000 | 15.000000 | 17.000000 |
#get numeric columns
numerics = list(df.select_dtypes(['int64', 'float64']).columns)
# create numeric dataframe
numerics = df[numerics]
numerics = pd.DataFrame(numerics)
#remove outliers
#numerics = numerics[(np.abs(stats.zscore(numerics)) < 3).all(axis=1)]
Q1 = numerics.quantile(0.25)
Q3 = numerics.quantile(0.75)
IQR = Q3 - Q1
numerics = numerics[~((numerics < (Q1 - 1.5 * IQR)) |(numerics > (Q3 + 1.5 * IQR))).any(axis=1)]
#get object columns
objectcols = list(df.select_dtypes(['object']).columns)
objectcols.append('EmployeeID')
#create dataframe from object columns
objectdf = df[objectcols]
objectdf = pd.DataFrame(objectdf)
newdf = objectdf.merge(numerics, how='inner', on='EmployeeID')
onehot = pd.get_dummies(newdf)
Figure 2
#plot correlation matrix
plt.figure(figsize = (30,30))
corrmat = onehot.corr()
sns.heatmap(corrmat, annot = True, square = True, cmap = 'YlGnBu')
plt.show()
Figure 3
# show top 10 correlations with Resigned variable
corr = onehot.corr()
corr = pd.DataFrame(corr)
#corr['Resigned_Yes'] = abs(corr['Resigned_Yes'].values)
res_yes = pd.DataFrame(corr['Resigned_Yes'])
res_yes['abs'] = abs(res_yes['Resigned_Yes'].values)
res_yes.sort_values(by = 'abs', ascending = False).head(14)
| Resigned_Yes | abs | |
|---|---|---|
| Resigned_Yes | 1.000000 | 1.000000 |
| Resigned_No | -1.000000 | 1.000000 |
| AverageWeeklyHoursWorked | 0.305732 | 0.305732 |
| OverTime_No | -0.220164 | 0.220164 |
| OverTime_Yes | 0.220164 | 0.220164 |
| YearsAtCompany | -0.191745 | 0.191745 |
| TotalWorkingYears | -0.187664 | 0.187664 |
| YearsInRole | -0.172654 | 0.172654 |
| Age | -0.168777 | 0.168777 |
| YearsWithCurrManager | -0.167105 | 0.167105 |
| MaritalStatus_Single | 0.163453 | 0.163453 |
| MonthlyIncome | -0.151037 | 0.151037 |
| BusinessTravel_Travel_Frequently | 0.123242 | 0.123242 |
| BusinessUnit_Consultants | -0.104012 | 0.104012 |
res_yes_df = df[['Resigned', 'AverageWeeklyHoursWorked', 'OverTime', 'YearsAtCompany',
'TotalWorkingYears', 'YearsInRole', 'Age', 'YearsWithCurrManager',
'MaritalStatus', 'MonthlyIncome', 'BusinessTravel', 'BusinessUnit']]
res_yes_df_onehot = onehot[['Resigned_Yes', 'AverageWeeklyHoursWorked', 'OverTime_Yes', 'YearsAtCompany',
'TotalWorkingYears', 'YearsInRole', 'Age', 'YearsWithCurrManager',
'MaritalStatus_Single', 'MonthlyIncome', 'BusinessTravel_Travel_Frequently', 'BusinessUnit_Consultants']]
res_yes_df_data = res_yes_df.drop(['Resigned'], axis =1)
res_yes_df_data = pd.DataFrame(res_yes_df_data)
Figure 4
# pairplot for numeric columns
sns.pairplot(res_yes_df, hue='Resigned')
plt.show()
Figure 5
#pairplot for all columns via one hot encoding
sns.pairplot(res_yes_df_onehot, hue='Resigned_Yes')
plt.show()
def plot_elbow_graph(dataset, k_range=range(1, 11)):
# A list holds the inertia values for each k
inertia_lst = []
for k in k_range:
km_model = KMeans(n_clusters=k, random_state=student_no)
km_model.fit(dataset)
inertia_lst.append(km_model.inertia_)
plt.figure(figsize=(10,6))
plt.plot(k_range, inertia_lst)
plt.xticks(k_range)
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.show()
def plot_silhouette_graph(dataset, k_range=range(2, 11)):
# A list holds the silhouette coefficients for each k
silhouette_coefficients = []
for k in k_range:
km_model = KMeans(n_clusters=k, random_state=student_no)
km_model.fit(dataset)
score = silhouette_score(dataset, km_model.labels_)
silhouette_coefficients.append(score)
plt.figure(figsize=(10,6))
plt.plot(k_range, silhouette_coefficients)
plt.xticks(k_range)
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()
#use the one hot encoded version of the data to obtain an elbow
#graph result (data must be numeric)
plot_elbow_graph(res_yes_df_onehot, k_range=range(2, 20))
plot_silhouette_graph(res_yes_df_onehot, k_range=range(2, 20))
#plotting the Resigned variable as a histogram
fig = hist = px.histogram(df, x = "Resigned",
labels = {"Resigned": "Resignation Status",
"count": "Number of Employees"},
title = "237 of 1470 Employees have Resigned (16%)")
fig.show()
km = KModes(n_clusters=3, init='Huang', n_init=5, verbose=1)
clusters = km.fit_predict(res_yes_df_data)
clusters = km.fit_predict(res_yes_df_data)
km_df = res_yes_df.copy()
# add a new column to the df and assign it the clustes labels
km_df['cluster'] = clusters
# Print the cluster centroids
print(km.cluster_centroids_)
print(km_df.loc[:,['cluster','Resigned']].value_counts().sort_index())
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 624, cost: 8935.0
Run 1, iteration: 2/100, moves: 198, cost: 8914.0
Run 1, iteration: 3/100, moves: 9, cost: 8913.0
Run 1, iteration: 4/100, moves: 0, cost: 8913.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 374, cost: 9241.0
Run 2, iteration: 2/100, moves: 112, cost: 9226.0
Run 2, iteration: 3/100, moves: 142, cost: 9076.0
Run 2, iteration: 4/100, moves: 46, cost: 9076.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 223, cost: 9295.0
Run 3, iteration: 2/100, moves: 2, cost: 9295.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 312, cost: 8950.0
Run 4, iteration: 2/100, moves: 252, cost: 8950.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 227, cost: 9154.0
Run 5, iteration: 2/100, moves: 25, cost: 9154.0
Best run was number 1
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 428, cost: 8943.0
Run 1, iteration: 2/100, moves: 75, cost: 8942.0
Run 1, iteration: 3/100, moves: 0, cost: 8942.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 435, cost: 9195.0
Run 2, iteration: 2/100, moves: 411, cost: 8917.0
Run 2, iteration: 3/100, moves: 114, cost: 8917.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 573, cost: 8986.0
Run 3, iteration: 2/100, moves: 183, cost: 8986.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 324, cost: 9011.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 395, cost: 8946.0
Run 5, iteration: 2/100, moves: 37, cost: 8946.0
Best run was number 2
[['40.0' 'No' '5' '6' '2' '31' '2' 'Married' '2342' 'Travel_Rarely'
'Consultants']
['40.0' 'No' '1' '1' '0' '35' '0' 'Married' '2559' 'Travel_Rarely'
'Consultants']
['40.0' 'No' '10' '10' '7' '34' '7' 'Single' '5343' 'Travel_Rarely'
'Sales']]
cluster Resigned
0 No 775
Yes 107
1 No 195
Yes 80
2 No 263
Yes 50
dtype: int64
km_df[km_df['cluster'] == 1].describe()
| AverageWeeklyHoursWorked | YearsAtCompany | TotalWorkingYears | YearsInRole | Age | YearsWithCurrManager | MonthlyIncome | cluster | |
|---|---|---|---|---|---|---|---|---|
| count | 275.000000 | 275.000000 | 275.000000 | 275.000000 | 275.000000 | 275.000000 | 275.000000 | 275.0 |
| mean | 43.680000 | 2.745455 | 8.643636 | 1.221818 | 35.254545 | 0.898182 | 5252.650909 | 1.0 |
| std | 5.853391 | 4.446019 | 8.031207 | 2.941938 | 9.870730 | 2.460750 | 4431.200704 | 0.0 |
| min | 40.000000 | 0.000000 | 0.000000 | 0.000000 | 18.000000 | 0.000000 | 1009.000000 | 1.0 |
| 25% | 40.000000 | 1.000000 | 1.000000 | 0.000000 | 29.000000 | 0.000000 | 2389.000000 | 1.0 |
| 50% | 40.000000 | 1.000000 | 7.000000 | 0.000000 | 35.000000 | 0.000000 | 3038.000000 | 1.0 |
| 75% | 48.500000 | 1.000000 | 15.000000 | 0.000000 | 41.000000 | 0.000000 | 6645.000000 | 1.0 |
| max | 67.000000 | 33.000000 | 34.000000 | 15.000000 | 60.000000 | 15.000000 | 19613.000000 | 1.0 |
from kmodes.kprototypes import KPrototypes
#dataframe to an array
smart_array = res_yes_df.values
#converting numerical columns datatype as float
smart_array[:, 3] = smart_array[:,3].astype(float)
smart_array[:, 4] = smart_array[:,4].astype(float)
smart_array[:, 5] = smart_array[:,5].astype(float)
smart_array[:, 6] = smart_array[:,6].astype(float)
smart_array[:, 7] = smart_array[:,7].astype(float)
smart_array[:, 9] = smart_array[:,9].astype(float)
#index of categorical columns
categorical_index = [0] + [2] + [8] + [10, 11]
print('done')
done
# Function for plotting elbow curve
def plot_elbow_curve(start, end, data):
no_of_clusters = list(range(start, end+1))
cost_values = []
for k in no_of_clusters:
test_model = KPrototypes(n_clusters=k, init='Huang',
random_state=student_no)
test_model.fit_predict(data, categorical=categorical_index)
cost_values.append(test_model.cost_)
sns.set_theme(style="whitegrid", palette="bright", font_scale=1.2)
plt.figure(figsize=(15, 7))
ax = sns.lineplot(x=no_of_clusters, y=cost_values, marker="o",
dashes=False)
ax.set_title('Elbow curve', fontsize=18)
ax.set_xlabel('No of clusters', fontsize=14)
ax.set_ylabel('Cost', fontsize=14)
ax.set(xlim=(start-0.1, end+0.1))
plt.plot()
# Plotting elbow curve for k=2 to k=10
plot_elbow_curve(2,10,smart_array)
model_6 = KPrototypes(n_clusters=6, init='Huang', random_state=student_no,
n_jobs=-1)
model_6.fit_predict(smart_array, categorical=categorical_index)
print(model_6.cost_)
#new column for cluster labels associated with each subject
res_yes_df['labels6'] = model_6.labels_
res_yes_df.head()
748428046.2954373
/var/folders/bg/9bc3wg_14jl4dwfytmh7brfh0000gn/T/ipykernel_22464/2095390600.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
| Resigned | AverageWeeklyHoursWorked | OverTime | YearsAtCompany | TotalWorkingYears | YearsInRole | Age | YearsWithCurrManager | MaritalStatus | MonthlyIncome | BusinessTravel | BusinessUnit | labels6 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Yes | 63.0 | Yes | 6 | 8 | 4 | 41 | 5 | Single | 5993 | Travel_Rarely | Sales | 4 |
| 1 | No | 40.0 | No | 10 | 10 | 7 | 49 | 7 | Married | 5130 | Travel_Frequently | Consultants | 0 |
| 2 | Yes | 50.0 | Yes | 0 | 7 | 0 | 37 | 0 | Single | 2090 | Travel_Rarely | Consultants | 1 |
| 3 | No | 48.0 | Yes | 8 | 8 | 7 | 33 | 0 | Married | 2909 | Travel_Frequently | Consultants | 1 |
| 4 | No | 40.0 | No | 2 | 6 | 2 | 27 | 2 | Married | 3468 | Travel_Rarely | Consultants | 1 |
res_yes_df.loc[:,['labels6','Resigned']].value_counts().sort_index().to_frame().rename({0:'count'}, axis=1)
| count | ||
|---|---|---|
| labels6 | Resigned | |
| 0 | No | 343 |
| Yes | 45 | |
| 1 | No | 361 |
| Yes | 128 | |
| 2 | No | 124 |
| Yes | 5 | |
| 3 | No | 138 |
| Yes | 26 | |
| 4 | No | 196 |
| Yes | 26 | |
| 5 | No | 71 |
| Yes | 7 |
res_yes_df1 = res_yes_df[res_yes_df['labels6'] == 1]
res_yes_df1.describe()
| AverageWeeklyHoursWorked | YearsAtCompany | TotalWorkingYears | YearsInRole | Age | YearsWithCurrManager | MonthlyIncome | labels6 | |
|---|---|---|---|---|---|---|---|---|
| count | 489.000000 | 489.000000 | 489.000000 | 489.000000 | 489.000000 | 489.000000 | 489.000000 | 489.0 |
| mean | 43.306748 | 3.940695 | 6.042945 | 2.492843 | 32.691207 | 2.480573 | 2571.952965 | 1.0 |
| std | 5.809826 | 3.276619 | 4.437198 | 2.560027 | 8.714315 | 2.677343 | 522.653393 | 0.0 |
| min | 40.000000 | 0.000000 | 0.000000 | 0.000000 | 18.000000 | 0.000000 | 1009.000000 | 1.0 |
| 25% | 40.000000 | 1.000000 | 3.000000 | 0.000000 | 27.000000 | 0.000000 | 2267.000000 | 1.0 |
| 50% | 40.000000 | 3.000000 | 6.000000 | 2.000000 | 31.000000 | 2.000000 | 2559.000000 | 1.0 |
| 75% | 48.000000 | 5.000000 | 8.000000 | 3.000000 | 37.000000 | 3.000000 | 2909.000000 | 1.0 |
| max | 70.000000 | 20.000000 | 20.000000 | 13.000000 | 59.000000 | 17.000000 | 3622.000000 | 1.0 |
res_yes_df1.describe(include=np.object)
/var/folders/bg/9bc3wg_14jl4dwfytmh7brfh0000gn/T/ipykernel_22464/461098861.py:1: DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
| Resigned | OverTime | MaritalStatus | BusinessTravel | BusinessUnit | |
|---|---|---|---|---|---|
| count | 489 | 489 | 489 | 489 | 489 |
| unique | 2 | 2 | 3 | 3 | 3 |
| top | No | No | Married | Travel_Rarely | Consultants |
| freq | 361 | 352 | 209 | 353 | 382 |
Figure 6
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10,5), sharex=True,
sharey=True)
# creating a scatterplot of the clustered data on the first sub-axes
sns.scatterplot(data=res_yes_df1, x='AverageWeeklyHoursWorked',
y='MonthlyIncome',
hue='Resigned', palette='colorblind', ax=axes[0], alpha=0.8)
# creating another scatter plot of the centroids on the first sub-axes
# replace hue=range() with the parameter k
sns.scatterplot(data=res_yes_df, x='AverageWeeklyHoursWorked',
y='MonthlyIncome',
hue='Resigned', palette='colorblind', ax=axes[1],
alpha=0.8)
# # setting the title for the first sub-axes
# # add the parameter k to the title
# axes[0].set_title(f'Clustered data with k={k}')
# creating a scatterplot of the true labeled data on the second sub-axes
sns.scatterplot(data=res_yes_df, x='AverageWeeklyHoursWorked',
y='MonthlyIncome',
hue='labels6', palette='colorblind', ax=axes[2]
,alpha=0.8)
# setting the title for the second sub-axes
axes[1].set_title('Original data')
plt.show()
Figure 7
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10,5), sharex=True,
sharey=True)
# creating a scatterplot of the clustered data on the first sub-axes
sns.scatterplot(data=res_yes_df, x='AverageWeeklyHoursWorked', y='Age',
hue='labels6', palette='colorblind', ax=axes[0],
alpha=0.8)
# creating a scatterplot of the true labeled data on the second sub-axes
sns.scatterplot(data=res_yes_df, x='AverageWeeklyHoursWorked',
y='Age',
hue='Resigned', palette='colorblind', ax=axes[1],
alpha=0.8)
# setting the title for the second sub-axes
axes[1].set_title('Original data')
plt.show()
Remuneration = onehot[['OverTime_Yes', 'MonthlyIncome', 'Resigned_Yes',
'Age']]
# standardizing the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
Remuneration_scaled = scaler.fit_transform(Remuneration)
#find optimal number of clusters:
wcssa = []
for i in range(1,10):
kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 0)
kmeans.fit(Remuneration_scaled)
wcssa.append(kmeans.inertia_)
plt.plot(range(1,10),wcssa)
plt.show()
# k means using 6 clusters and k-means++ initialization
model = KMeans(n_clusters = 4, init='k-means++', random_state = student_no)
rres = model.fit(Remuneration_scaled)
Remuneration["cluster"] = rres.predict(Remuneration_scaled)
Remuneration["cluster"].value_counts()
/var/folders/bg/9bc3wg_14jl4dwfytmh7brfh0000gn/T/ipykernel_22464/2639930091.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
1 352 2 175 0 155 3 138 Name: cluster, dtype: int64
Figure 8
#this is a scatterplot of the original dataframe Monthly
#Income vs Age with the Resigned status highlighted in red
fig, ax = plt.subplots()
colors = {0.0:'cornflowerblue', 1.0: 'red'}
ax.scatter(onehot['Age'], onehot['MonthlyIncome'],
c=onehot['Resigned_Yes'].map(colors))
plt.show()
Figure 9
#the below plots is the result of K-means clustering analysis for
#Monthly Income vs Age
#the black colour shows a pattern similar to that of the 'resigned'
#variable indicating the model has identified a cluster of employees
#that would be
#likely to resign based on Age and Income as inputs
fig, ax = plt.subplots(1)
# plot clustering result
plt.scatter(Remuneration.iloc[:,3],
Remuneration.iloc[:,1],
c=Remuneration.cluster,
alpha=0.8)
plt.show()
The light coloured dots correspond to the red dots of the 'Resigned' variable in the first tplot
R0 = Remuneration[Remuneration['cluster'] == 0]
R0.describe()
| OverTime_Yes | MonthlyIncome | Resigned_Yes | Age | cluster | |
|---|---|---|---|---|---|
| count | 155.0 | 155.000000 | 155.0 | 155.000000 | 155.0 |
| mean | 1.0 | 5325.722581 | 0.0 | 35.961290 | 0.0 |
| std | 0.0 | 2470.199928 | 0.0 | 7.849133 | 0.0 |
| min | 1.0 | 1129.000000 | 0.0 | 19.000000 | 0.0 |
| 25% | 1.0 | 3535.000000 | 0.0 | 30.000000 | 0.0 |
| 50% | 1.0 | 4930.000000 | 0.0 | 35.000000 | 0.0 |
| 75% | 1.0 | 6563.500000 | 0.0 | 41.000000 | 0.0 |
| max | 1.0 | 13603.000000 | 0.0 | 60.000000 | 0.0 |
R1 = Remuneration[Remuneration['cluster'] == 1]
R1.describe()
| OverTime_Yes | MonthlyIncome | Resigned_Yes | Age | cluster | |
|---|---|---|---|---|---|
| count | 352.0 | 352.000000 | 352.0 | 352.000000 | 352.0 |
| mean | 0.0 | 3857.948864 | 0.0 | 32.286932 | 1.0 |
| std | 0.0 | 1523.547554 | 0.0 | 5.840721 | 0.0 |
| min | 0.0 | 1051.000000 | 0.0 | 18.000000 | 1.0 |
| 25% | 0.0 | 2550.750000 | 0.0 | 28.000000 | 1.0 |
| 50% | 0.0 | 3630.000000 | 0.0 | 32.000000 | 1.0 |
| 75% | 0.0 | 4821.500000 | 0.0 | 36.000000 | 1.0 |
| max | 0.0 | 8639.000000 | 0.0 | 48.000000 | 1.0 |
R3 = Remuneration[Remuneration['cluster'] == 3]
R3.describe()
| OverTime_Yes | MonthlyIncome | Resigned_Yes | Age | cluster | |
|---|---|---|---|---|---|
| count | 138.000000 | 138.000000 | 138.0 | 138.000000 | 138.0 |
| mean | 0.514493 | 3999.586957 | 1.0 | 31.804348 | 3.0 |
| std | 0.501611 | 2473.391917 | 0.0 | 8.414325 | 0.0 |
| min | 0.000000 | 1081.000000 | 1.0 | 18.000000 | 3.0 |
| 25% | 0.000000 | 2336.750000 | 1.0 | 26.250000 | 3.0 |
| 50% | 1.000000 | 2897.500000 | 1.0 | 30.500000 | 3.0 |
| 75% | 1.000000 | 5287.500000 | 1.0 | 35.000000 | 3.0 |
| max | 1.000000 | 13610.000000 | 1.0 | 56.000000 | 3.0 |